import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import qgrid # for interactive dataframe view
from pycaret.classification import *
# Global plot theme: use seaborn's "ticks" style for every figure below.
sns.set_style("ticks")
# --- Custom helper function definitions ---
## EDA helper - binning | counts | ratio. Largely superseded by the Sweetviz reports below.
def ratio_plot(dataset0, Y_Cols):
    """For every feature column, plot the count of all rows vs. positive
    targets per category/quantile-bin, with the positive ratio on a twin
    y-axis.

    Parameters
    ----------
    dataset0 : pd.DataFrame
        Feature columns followed by the target column (target assumed to
        be the LAST column; it is skipped by the loop below).
    Y_Cols : list[str]
        Single-element list naming the binary (0/1) target column.
    """
    cols_float = dataset0.dtypes[dataset0.dtypes == float].index
    cols_object = dataset0.dtypes[dataset0.dtypes == object].index
    ratio_df_list = []
    dataset_bin = dataset0.copy()
    # Last column is assumed to be the target, so it is excluded here.
    for col_i in dataset0.columns[:-1]:
        if col_i in cols_float:
            # BUGFIX: the original binned the *global* `dataset` instead of
            # the function argument `dataset0`, silently coupling this helper
            # to an unrelated module-level variable.
            dataset_bin[col_i] = pd.qcut(dataset0[col_i], q=5, duplicates='drop')
        cnt = dataset_bin.groupby(col_i)[Y_Cols].count()
        sum_1 = dataset_bin.groupby(col_i)[Y_Cols].sum()
        rst_df = pd.concat((cnt, sum_1), axis=1)
        rst_df.columns = ['count', 'sum']
        rst_df['ratio'] = rst_df['sum'] / rst_df['count']
        ratio_df_list.append(rst_df)
    for df_ratio_i in ratio_df_list:
        labels = [str(i) for i in df_ratio_i.index]
        fig, ax = plt.subplots()
        # Overlaid bars: total count behind, positive count in front.
        ax.bar(labels, df_ratio_i[df_ratio_i.columns[0]], label='ALL')
        ax.bar(labels, df_ratio_i[df_ratio_i.columns[1]], label=f"{Y_Cols}")
        ax.set_ylabel(f"Count All and {Y_Cols}", color="black", fontsize=14)
        ax.set_xlabel(df_ratio_i.index.name)
        ax.set_xticks(labels)
        ax2 = ax.twinx()
        ax2.plot(labels, df_ratio_i[df_ratio_i.columns[2]], color="blue", marker="o")
        ax2.set_ylabel("Ratio", color="blue", fontsize=14)
        # Align the twin axis' tick positions with the primary axis' ticks
        # by mapping the primary ticks' relative positions onto ax2's range.
        ylim1 = ax.get_ylim()
        len1 = ylim1[1] - ylim1[0]
        yticks1 = ax.get_yticks()
        rel_dist = [(y - ylim1[0]) / len1 for y in yticks1]
        ylim2 = ax2.get_ylim()
        len2 = ylim2[1] - ylim2[0]
        yticks2 = [ry * len2 + ylim2[0] for ry in rel_dist]
        ax2.set_yticks(yticks2)
        ax2.set_ylim(ylim2)  # <-- re-adjust the limits to the original values
        # Just add a title and rotate the x-axis labels to be horizontal.
        plt.title(f'Counts and Ratio for {df_ratio_i.index.name}')
        plt.xticks(rotation=90, ha='center')
        plt.show()
## Feature importance: tree/MDI-based importance vs. permutation importance, side by side.
def FI(model, dataset, X_Cols, Y_Cols):
    """Plot impurity-based (MDI) and permutation feature importances for a
    fitted model in a two-panel figure.

    Parameters
    ----------
    model : fitted estimator
        Must be fitted; MDI panel is skipped if it has no
        `feature_importances_` attribute.
    dataset : pd.DataFrame
        Frame containing both the feature and target columns.
    X_Cols, Y_Cols : sequences of column names (features / target).
    """
    X_names = np.array(X_Cols)
    fig = plt.figure()
    # Left panel: impurity-based importances (tree models only).
    try:
        model_FI = model.feature_importances_  # e.g. tuned tree model
        sorted_idx = model_FI.argsort()
        y_ticks = np.arange(0, len(X_names))
        ax = plt.subplot(121)
        ax.barh(y_ticks, model_FI[sorted_idx])
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(X_names[sorted_idx])
        # Fixed: title string was missing its closing parenthesis.
        ax.set_title("Feature Importances (MDI:Mean Decrease in Impurity)")
    except AttributeError:
        # Narrowed from a bare `except:`; only "no such attribute" is expected.
        print("FI failed")
    from sklearn.inspection import permutation_importance
    # Right panel: permutation importances, computed on complete (NaN-free) rows.
    try:
        complete_rows = dataset.isna().sum(axis=1) == 0  # hoisted, computed once
        result = permutation_importance(
            model,
            dataset[complete_rows][X_names],
            dataset[complete_rows][Y_Cols],
            n_repeats=10,
            random_state=42,
            n_jobs=2)
        sorted_idx = result.importances_mean.argsort()
        ax = plt.subplot(122)
        ax.boxplot(result.importances[sorted_idx].T,
                   vert=False,
                   labels=X_names[sorted_idx])
        ax.set_title("Permutation Importances (train set)")
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        print("PFI failed")
    fig.tight_layout()
    fig.show()
# Titanic data: train/test splits plus the sample submission file.
train = pd.read_csv('./input/titanic/train.csv')
test = pd.read_csv('./input/titanic/test.csv')
sub = pd.read_csv('./input/titanic/gender_submission.csv')
# add random variable to interpret Feature Importance
# (real features should rank above these pure-noise columns)
rng = np.random.RandomState(seed=42)
train["random_cat"] = rng.randint(3, size=train.shape[0])
train["random_num"] = rng.randn(train.shape[0])
test["random_cat"] = rng.randint(3, size=test.shape[0])
test["random_num"] = rng.randn(test.shape[0])
# Interactive grid preview of the raw training frame (notebook widget).
qgrid.show_grid(train, grid_options={'maxVisibleRows': 5})
import sweetviz as sv
# Automated EDA report comparing train vs. test with respect to the target.
comparison_report = sv.compare([train,'Train'], [test,'Test'], target_feat='Survived')
comparison_report.show_notebook(scale=0.8)
# Feature / target column selection (plus the two noise columns added above).
X_Cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_Cols_categ = ['Sex', 'Pclass']
X_Cols_Rand = ['random_cat','random_num']
Y_Cols = ['Survived']
dataset0 = train[X_Cols + X_Cols_Rand + Y_Cols]
dataset = dataset0.copy()
# NOTE(review): despite the name, this selects columns whose dtype is NOT
# float — those are the ones cast to float in the loop below. Confirm the
# `!=` is intended (the helper at the top of the file uses `== float`).
cols_float = dataset0.dtypes[dataset0.dtypes != float].index
cols_object = dataset0.dtypes[dataset0.dtypes == object].index
# Integer-encode the two string categoricals in place.
dataset.Sex = dataset.Sex.replace({'male': 0, 'female': 1})
dataset.Embarked = dataset.Embarked.replace({'S': 0, 'Q': 1, 'C': 2})
for col_i in dataset.columns:
    if col_i in cols_float:
        dataset[col_i] = dataset[col_i].astype(float)
# Interactive grid preview of the encoded frame.
qgrid.show_grid(dataset, grid_options={'maxVisibleRows': 5})
# Sweetviz intra-set comparison: male vs. female rows within train.
intra_report = sv.compare_intra(train, train["Sex"] == 'male', ["Male", "Female"], 'Survived')
intra_report.show_notebook(w=900, h=450, scale=0.8)
# NOTE(review): newer seaborn versions removed the 'bw' KDE kwarg in favor
# of 'bw_method'/'bw_adjust' — confirm the installed seaborn version.
sns.pairplot(dataset, hue='Sex',corner=True,
             palette='husl',kind='reg',diag_kws={'bw': 0.2})
# One-Hot Encoded
# (kept for reference: a setup() variant where Sex/Embarked are declared
# categorical so pycaret one-hot encodes them itself)
'''
clf1 = setup(data = train,
#test_data = test,
#normalize = True,
target = 'Survived',
remove_outliers = True,
numeric_imputation = 'mean',
categorical_features = ['Sex','Embarked'],
ignore_features = ['Name','Ticket','Cabin'],
fold_shuffle=True, session_id=2,
imputation_type='iterative',
silent = True)
'''
#Categorical values are well ordered using integer
# Active setup: uses the manually integer-encoded `dataset`, so the
# categoricals enter the models as ordered numeric features.
# NOTE(review): `silent` was removed in pycaret 3.x — this targets 2.x.
clf1 = setup(data = dataset,
#test_data = test,
#normalize = True,
target = 'Survived',
remove_outliers = True,
numeric_imputation = 'mean',
#fold_shuffle=True, session_id=2,
imputation_type='iterative',
silent = True)
# Train and rank all available classifiers; keep the top 18 ('gpc' excluded).
model_top18 = compare_models(n_select=18,exclude=['gpc'],turbo=False)
model_top3 = model_top18[:3]
# Ensemble the top-3: a voting blend and a stacked meta-model, tuned on AUC.
blended_l = blend_models(estimator_list = model_top3, fold = 5, optimize = 'AUC')
stacked_I = stack_models(estimator_list = model_top3, fold = 5, optimize = 'AUC')
#model = create_model('lightgbm')
model = model_top3[0]
#model = model_top3[1]
# Hyperparameter-tune the best single model, pull its CV table, and open
# pycaret's interactive evaluation widget.
tuned_model = tune_model(model)
tuned_model_df = pull(tuned_model)
evaluate_model(tuned_model)
# Clustered correlation heatmap of the encoded dataset (features + target):
# the dendrogram groups columns with similar correlation patterns.
sns.set(font_scale=1.0)
corr = dataset.corr()
heatmap_style = dict(
    square=True,
    annot=True,
    cbar=True,
    center=0,
    vmin=-1,
    vmax=1,
    figsize=(8, 6),
    annot_kws={"size": 8},
    cmap="coolwarm",
)
g = sns.clustermap(data=corr, **heatmap_style)
- survived : 생존=1, 죽음=0
- pclass : 승객 등급. 1등급=1, 2등급=2, 3등급=3
- sibsp : 함께 탑승한 형제 또는 배우자 수
- parch : 함께 탑승한 부모 또는 자녀 수
- ticket : 티켓 번호
- cabin : 선실 번호
- embarked : 탑승장소 S=Southampton, C=Cherbourg, Q=Queenstown
We can easily see X-Y similarity using the correlation matrix and dendrogram
Y 인자 ("Survived") 기준, Best Correlation "Gender", "SibSp", "P_Class" 순
X 인자들 기준
# Rebuild the exact (transformed) training frame pycaret used internally.
train_data_using_model = pd.concat((get_config("X_train"),get_config("y_train")),axis=1)
# MDI vs. permutation importance for the best single model and the stack.
FI(model_top3[0], train_data_using_model, get_config("X_train").columns, Y_Cols)
FI(stacked_I, train_data_using_model, get_config("X_train").columns, Y_Cols)
# target_plot (model-independent, data only) vs. the model's actual
# predictions, checked per feature.
from pdpbox import pdp, get_dataset, info_plots
for col_i in get_config("X_train").columns:
    # Distribution of the target across bins of this feature (pure EDA).
    fig, axes, summary_df = info_plots.target_plot(df=train_data_using_model,
                                                   feature=col_i,
                                                   figsize=(6,4),
                                                   feature_name=col_i,
                                                   target=Y_Cols)
    # Model's predicted values across the same feature bins.
    fig, axes, summary_df = info_plots.actual_plot(model=tuned_model,
                                                   X=get_config("X_train"),
                                                   figsize=(6,4),
                                                   feature=col_i,
                                                   feature_name=col_i)
PDP and ICE plots show how the model's prediction responds to each feature
for col_i in get_config("X_train").columns:
    # 1-D partial dependence with individual (ICE) curves per sample.
    pdp_i = pdp.pdp_isolate(model=tuned_model,
                            dataset=train_data_using_model,
                            model_features=get_config("X_train").columns,
                            feature=col_i)
    fig, axes = pdp.pdp_plot(
        pdp_isolate_out=pdp_i,
        plot_lines=True, # True : ICE - individual curve, False : PDP with ICE range
        center=True,
        #plot_pts_dist=True,
        x_quantile=True,
        show_percentile=True,
        figsize=(8, 6),
        ncols=2,
        feature_name=col_i)
# note datasize and interactions
# 2-D interaction views for the Age x Fare pair: data-only target grid,
# model-actual grid, and a partial-dependence interaction grid.
fig, axes, summary_df = info_plots.target_plot_interact(
    df=train_data_using_model, features=['Age', 'Fare'], feature_names=['Age', 'Fare'], target=Y_Cols
)
fig, axes, summary_df = info_plots.actual_plot_interact(
    model = tuned_model, X=get_config("X_train"), features=['Age', 'Fare'], feature_names=['Age', 'Fare']
)
inter1 = pdp.pdp_interact(model=tuned_model,
                          dataset=get_config("X_train"),
                          model_features=get_config("X_train").columns,
                          features=['Age', 'Fare'])
fig, axes = pdp.pdp_interact_plot(pdp_interact_out=inter1,
                                  feature_names=['age', 'fare'],
                                  figsize=(8, 9),
                                  plot_type= 'grid', #'contour'
                                  x_quantile=True,
                                  plot_pdp=True)
# Note target_plot is traditional EDA view !!
# Custom binned count/ratio plots on the raw (pre-encoding) dataset.
ratio_plot(dataset0,Y_Cols)
#interpret_model(tuned_model)
#interpret_model(tuned_model, plot='correlation',kwargs={vmin})
import shap
# for normal cases
# Build a SHAP explainer for the tuned model and explain the training set.
explainer = shap.Explainer(tuned_model)
expected_value = explainer.expected_value
shap_values = explainer(get_config("X_train"))
expected_value
# summarize the effects of all the features
sns.set(font_scale=1.0)
plt.figure()
# Three global SHAP views side by side: mean-impact bar chart, beeswarm
# of per-sample values, and a decision plot over all samples.
plt.subplot(1, 3, 1)
shap.plots.bar(shap_values, show=False)
plt.subplot(1, 3, 2)
shap.plots.beeswarm(shap_values, show=False, alpha=0.5)
plt.yticks([])
plt.subplot(1, 3, 3)
shap.decision_plot(explainer.expected_value,
                   shap_values.values,
                   get_config("X_train"),
                   link='logit',
                   show=False)
plt.yticks([])
plt.tight_layout()
#shap.summary_plot(shap_values, get_config("X_train"))
Global Feature 관점에서 PFI와 동일하게 Gender/P_Class, Fare/Age 순서로 Y에 영향을 주는 형태로 나타나고 있다. Local 관점에서 Gender 에 따라 극명하게 결과가 나뉘는(분포가 분리되어 있는) 형태임을 볼 수 있고, Pclass, Fare 에 대해서도 꽤 분리도가 높게 나타난다. 즉, 개별 data 또한 Gender/Pclass/Fare 의 global 경향(방향성)을 따른다는 것을 알 수 있다.
# Local explanation for a single training sample (row index `selection`).
selection = 1
shap.initjs()
# Interactive force plot for the selected sample.
shap.force_plot(explainer.expected_value,
                shap_values.values[selection, :],
                get_config("X_train").iloc[selection, :],
                link='logit',
                #matplotlib=True,
                show=False)
plt.figure(figsize=(8,6))
# Decision plot (left) and waterfall plot (right) for the same sample.
plt.subplot(1, 2, 1)
shap.decision_plot(explainer.expected_value,
                   shap_values.values[selection, :],
                   get_config("X_train").iloc[selection, :],
                   link='logit',
                   show=False)
plt.yticks([])
plt.subplot(1, 2, 2)
shap.plots.waterfall(shap_values[selection], show=False)
plt.tight_layout()
아래 interpret_model 을 통해 3가지 큰 부류 확인
# visualize all the training set predictions
# shap.plots.force(shap_values)
# Aggregated SHAP "reason" (force) plot via pycaret's wrapper.
interpret_model(tuned_model, plot='reason')
모델은 아래와 같이 학습되었음 (실제 현상이 그런 것이 아니라 모델이 그렇다는 것)
# SHAP - Correlation: per-feature dependence scatter; passing the full
# shap_values as `color` lets shap pick an interacting feature for coloring.
for col_i in get_config("X_train").columns:
    shap.plots.scatter(shap_values[:, col_i], color=shap_values, alpha=0.5)
shap_values.shape
# visualize all the training set predictions
#shap.force_plot(explainer.expected_value,shap_values)
#shap.force_plot(explainer.expected_value, shap_values, get_config("X_train"))
interpret_model(tuned_model,plot='correlation',feature=get_config('X_train').columns[0])
# sklearn partial_dependency
# (kept for reference only: sklearn's built-in PDP. NOTE(review):
# `plot_partial_dependence` was deprecated/removed in newer sklearn in
# favor of PartialDependenceDisplay — confirm version before reviving.)
'''
import matplotlib.pyplot as plt
#from sklearn.inspection import partial_dependence
#from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
plot_partial_dependence(model_top3[2], X=get_config('X_train'),
features=get_config('X_train').columns[1:2],
grid_resolution=20)
'''
# Simple PDP and ICE check
# Hand-rolled PDP/ICE: replicate the training frame once per grid value of
# one feature, predict each replica, then average (PDP) / pivot per row (ICE).
ref_data = get_config('X_train').copy()
# Second reset_index materializes a row-id column named 'index'.
ref_data = ref_data.reset_index(drop=True).reset_index()
ref_data_expand = pd.DataFrame()
# NOTE(review): columns[2] is position-dependent (column 0 is the added
# 'index' id); the plots below assume it resolves to 'Age' — verify.
target_col_i = ref_data.columns[2]
# 20-point evenly spaced grid over the feature's observed range.
Xs_grid = np.linspace(ref_data[target_col_i].min(),
                      ref_data[target_col_i].max(),
                      num=20)
for grid_i in Xs_grid:
    ref_data[target_col_i] = grid_i
    ref_data_expand = pd.concat((ref_data_expand, ref_data), axis=0)
# NOTE(review): the extra 'index' column is passed to predict_proba along
# with the real features — confirm the model/pipeline tolerates it.
ref_data_expand['Y_hat'] = model_top3[0].predict_proba(ref_data_expand)[:, 1]
plt.figure(figsize=(6,4))
ax = plt.subplot(1, 1, 1)
# Snap Age values to their bin midpoints so groupby/pivot keys align.
ss = pd.cut(ref_data_expand['Age'], 20)
s = [(i.left + i.right) / 2 for i in ss.values]
ref_data_expand['Age'] = s
# PDP
ref_data_expand.groupby('Age')['Y_hat'].mean().plot(xlim=[0, 80],
                                                    ylim=[0.30, 0.70],
                                                    lw=5,
                                                    c='blue',
                                                    ax=ax)
# ICE
ref_data_expand.pivot_table(index='Age', columns='index',
                            values='Y_hat').plot(xlim=[0, 80],
                                                 ylim=[0, 1],
                                                 legend=[],
                                                 c='green',
                                                 lw=0.2,
                                                 ax=ax)
# Pairwise SHAP interaction values (available for tree-based explainers).
shap_interaction_values = explainer.shap_interaction_values(get_config('X_train'))
# BUGFIX: shap_interaction_values has shape (n_samples, n_features,
# n_features); seaborn's heatmap needs a 2-D matrix, so aggregate the
# interaction strength (mean absolute value) over samples first.
sns.heatmap(np.abs(shap_interaction_values).mean(axis=0))
'''
explainer = shap.TreeExplainer(tuned_model)
explainer.shap_values(get_config('X_train')) # <- it always works
#shap_interaction_values = explainer.shap_interaction_values(get_config('X_train'))
'''
# BUGFIX: help() should be given the function object itself; pycaret 2.x's
# get_config() raises when called without a variable-name argument.
help(get_config)